library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.1 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(ggrepel)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Read the catalogue; empty strings and the literal text "NA" both become NA.
netflix <- read.csv(
  "netflix.csv",
  na.strings = c("", "NA"),
  stringsAsFactors = FALSE
)
# Missing-data audit: total number of NA cells, then the count per column.
sum(is.na(netflix))
## [1] 3631
colSums(is.na(netflix))
## show_id type title director cast country
## 0 0 0 2389 718 507
## date_added release_year rating duration listed_in description
## 10 0 7 0 0 0
## Change categorical variables from character to factor.
netflix$rating <- as.factor(netflix$rating)
netflix$listed_in <- as.factor(netflix$listed_in)
# "TV Show" is placed first so tables and plots list it before "Movie".
netflix$type <- factor(netflix$type, levels = c("TV Show", "Movie"))

# Parse the "Month day, year" text into a Date, kept in a separate column.
netflix$date_added2 <- mdy(netflix$date_added)

# Derive the month and weekday on which each title was added.
# lubridate returns ordered factors with labels in the session's time locale,
# so the calendar order comes from lubridate itself rather than from a
# hard-coded list of English names (which would turn every value into NA
# under a non-English locale). `week_start = 1` makes Monday the first level.
netflix <- netflix %>%
  mutate(
    month = month(date_added2, label = TRUE, abbr = FALSE),
    day = wday(date_added2, label = TRUE, abbr = FALSE, week_start = 1)
  )
# Horizontal bar chart of how many titles were added in each calendar month,
# sorted by count. Rows without a parsed date are excluded. The counts cover
# every row of `netflix` (movies and TV shows) and are based on the date the
# title was added, so the labels say "titles added" rather than
# "movies released".
netflix %>%
  filter(!is.na(month)) %>%
  count(month, name = "count") %>%
  ggplot(aes(x = reorder(month, count), y = count, fill = month)) +
  geom_col() +
  xlab("months of the year") +
  ylab("Number of titles") +
  ggtitle("Number of titles added by month") +
  coord_flip()
# Horizontal bar chart of how many titles were added on each day of the week,
# sorted by count. Rows without a parsed date are excluded. The counts cover
# movies and TV shows alike and use the date the title was added, so the
# labels say "titles added" rather than "movies released".
netflix %>%
  filter(!is.na(day)) %>%
  count(day, name = "count") %>%
  ggplot(aes(x = reorder(day, count), y = count, fill = day)) +
  geom_col() +
  xlab("days of the week") +
  ylab("Number of titles") +
  ggtitle("Number of titles added by day of the week") +
  coord_flip()
## Change the date format: show the raw text values, then overwrite the
## column in place with parsed Date values and show the result.
head(netflix$date_added)
## [1] "August 14, 2020" "December 23, 2016" "December 20, 2018"
## [4] "November 16, 2017" "January 1, 2020" "July 1, 2017"
# mdy() parses month-day-year text; after this line date_added holds the same
# parsed dates as the date_added2 column created earlier.
netflix$date_added <- mdy(netflix$date_added)
head(netflix$date_added)
## [1] "2020-08-14" "2016-12-23" "2018-12-20" "2017-11-16" "2020-01-01"
## [6] "2017-07-01"
# Preview the first rows, including the derived date_added2, month and day columns.
head(netflix)
## show_id type title director
## 1 s1 TV Show 3% <NA>
## 2 s2 Movie 7:19 Jorge Michel Grau
## 3 s3 Movie 23:59 Gilbert Chan
## 4 s4 Movie 9 Shane Acker
## 5 s5 Movie 21 Robert Luketic
## 6 s6 TV Show 46 Serdar Akar
## cast
## 1 João Miguel, Bianca Comparato, Michel Gomes, Rodolfo Valente, Vaneza Oliveira, Rafael Lozano, Viviane Porto, Mel Fronckowiak, Sergio Mamberti, Zezé Motta, Celso Frateschi
## 2 Demián Bichir, Héctor Bonilla, Oscar Serrano, Azalia Ortiz, Octavio Michel, Carmen Beato
## 3 Tedd Chan, Stella Chung, Henley Hii, Lawrence Koh, Tommy Kuan, Josh Lai, Mark Lee, Susan Leong, Benjamin Lim
## 4 Elijah Wood, John C. Reilly, Jennifer Connelly, Christopher Plummer, Crispin Glover, Martin Landau, Fred Tatasciore, Alan Oppenheimer, Tom Kane
## 5 Jim Sturgess, Kevin Spacey, Kate Bosworth, Aaron Yoo, Liza Lapira, Jacob Pitts, Laurence Fishburne, Jack McGee, Josh Gad, Sam Golzari, Helen Carey, Jack Gilpin
## 6 Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan, Saygın Soysal, Berkan Şal, Metin Belgin, Ayça Eren, Selin Uludoğan, Özay Fecht, Suna Yıldızoğlu
## country date_added release_year rating duration
## 1 Brazil 2020-08-14 2020 TV-MA 4 Seasons
## 2 Mexico 2016-12-23 2016 TV-MA 93 min
## 3 Singapore 2018-12-20 2011 R 78 min
## 4 United States 2017-11-16 2009 PG-13 80 min
## 5 United States 2020-01-01 2008 PG-13 123 min
## 6 Turkey 2017-07-01 2016 TV-MA 1 Season
## listed_in
## 1 International TV Shows, TV Dramas, TV Sci-Fi & Fantasy
## 2 Dramas, International Movies
## 3 Horror Movies, International Movies
## 4 Action & Adventure, Independent Movies, Sci-Fi & Fantasy
## 5 Dramas
## 6 International TV Shows, TV Dramas, TV Mysteries
## description
## 1 In a future where the elite inhabit an island paradise far from the crowded slums, you get one chance to join the 3% saved from squalor.
## 2 After a devastating earthquake hits Mexico City, trapped survivors from all walks of life wait to be rescued while trying desperately to stay alive.
## 3 When an army recruit is found dead, his fellow soldiers are forced to confront a terrifying secret that's haunting their jungle island training camp.
## 4 In a postapocalyptic world, rag-doll robots hide in fear from dangerous machines out to exterminate them, until a brave newcomer joins the group.
## 5 A brilliant group of students become card-counting experts with the intent of swindling millions out of Las Vegas casinos by playing blackjack.
## 6 A genetics professor experiments with a treatment for his comatose sister that blends medical and shamanic cures, but unlocks a shocking side effect.
## date_added2 month day
## 1 2020-08-14 August Friday
## 2 2016-12-23 December Friday
## 3 2018-12-20 December Thursday
## 4 2017-11-16 November Thursday
## 5 2020-01-01 January Wednesday
## 6 2017-07-01 July Saturday
# Per-column summary: level counts for factors, ranges for dates and years,
# and the number of NA values where a column has any.
summary(netflix)
## show_id type title director
## Length:7787 TV Show:2410 Length:7787 Length:7787
## Class :character Movie :5377 Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## cast country date_added release_year
## Length:7787 Length:7787 Min. :2008-01-01 Min. :1925
## Class :character Class :character 1st Qu.:2018-02-01 1st Qu.:2013
## Mode :character Mode :character Median :2019-03-08 Median :2017
## Mean :2019-01-02 Mean :2014
## 3rd Qu.:2020-01-20 3rd Qu.:2018
## Max. :2021-01-16 Max. :2021
## NA's :10
## rating duration
## TV-MA :2863 Length:7787
## TV-14 :1931 Class :character
## TV-PG : 806 Mode :character
## R : 665
## PG-13 : 386
## (Other):1129
## NA's : 7
## listed_in description
## Documentaries : 334 Length:7787
## Stand-Up Comedy : 321 Class :character
## Dramas, International Movies : 320 Mode :character
## Comedies, Dramas, International Movies : 243
## Dramas, Independent Movies, International Movies: 215
## Kids' TV : 205
## (Other) :6149
## date_added2 month day
## Min. :2008-01-01 December: 833 Friday :2287
## 1st Qu.:2018-02-01 October : 785 Thursday :1147
## Median :2019-03-08 January : 757 Tuesday :1070
## Mean :2019-01-02 November: 738 Wednesday:1020
## 3rd Qu.:2020-01-20 March : 669 Monday : 814
## Max. :2021-01-16 (Other) :3995 (Other) :1439
## NA's :10 NA's : 10 NA's : 10
# Compact column-wise view: one line per column with its type and first values.
glimpse(netflix)
## Rows: 7,787
## Columns: 15
## $ show_id <chr> "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s1…
## $ type <fct> TV Show, Movie, Movie, Movie, Movie, TV Show, Movie, Movi…
## $ title <chr> "3%", "7:19", "23:59", "9", "21", "46", "122", "187", "70…
## $ director <chr> NA, "Jorge Michel Grau", "Gilbert Chan", "Shane Acker", "…
## $ cast <chr> "João Miguel, Bianca Comparato, Michel Gomes, Rodolfo Val…
## $ country <chr> "Brazil", "Mexico", "Singapore", "United States", "United…
## $ date_added <date> 2020-08-14, 2016-12-23, 2018-12-20, 2017-11-16, 2020-01-…
## $ release_year <int> 2020, 2016, 2011, 2009, 2008, 2016, 2019, 1997, 2019, 200…
## $ rating <fct> TV-MA, TV-MA, R, PG-13, PG-13, TV-MA, TV-MA, R, TV-14, TV…
## $ duration <chr> "4 Seasons", "93 min", "78 min", "80 min", "123 min", "1 …
## $ listed_in <fct> "International TV Shows, TV Dramas, TV Sci-Fi & Fantasy",…
## $ description <chr> "In a future where the elite inhabit an island paradise f…
## $ date_added2 <date> 2020-08-14, 2016-12-23, 2018-12-20, 2017-11-16, 2020-01-…
## $ month <ord> August, December, December, November, January, July, June…
## $ day <ord> Friday, Friday, Thursday, Thursday, Wednesday, Saturday, …
# Number of titles per type, first as a table ...
table(netflix$type)
##
## TV Show Movie
## 2410 5377
# ... then as a bar chart built from the same counts.
ggplot(count(netflix, type), aes(x = type, y = n, fill = type)) +
  geom_col() +
  labs(title = "Show Types") +
  theme_minimal()
###
# Pie chart of the share of movies versus TV shows.
# The slice size must come from the numeric count `n`; the formatted
# percentage string `prop` is used only as the text label. Mapping the string
# to `y` would put the bars on a discrete scale, so slice sizes would no
# longer reflect the data.
netflix %>%
  count(type, sort = TRUE) %>%
  mutate(prop = paste0(round(n / sum(n) * 100, 0), "%")) %>%
  ggplot(aes(x = "", y = n, fill = type)) +
  geom_bar(
    stat = "identity",
    width = 1,
    color = "steelblue",
    size = 1
  ) +
  # Polar coordinates on y turn the single stacked bar into a pie.
  coord_polar("y", start = 0) +
  geom_text(
    aes(label = prop),
    position = position_stack(vjust = 0.5), # centre each label in its slice
    size = 6,
    col = "white",
    fontface = "bold"
  ) +
  scale_fill_manual(values = c('#e41a1c', '#377eb8')) +
  theme_void() +
  labs(
    title = 'Proportion of Movies to TV shows',
    fill = ""
  )
# netflix %>% filter(title=="Black Mirror: Bandersnatch")
library(broom)
# Movies only, restricted to rows where country, type, duration, rating and
# title are all present, with the duration text parsed to a number.
movies <- netflix %>%
  filter(type == "Movie") %>%
  select(country, type, duration, rating, title) %>%
  drop_na() %>%
  mutate(duration_min = parse_number(duration))

# Movies with a parsed duration above 200, longest first.
movies %>%
  filter(duration_min > 200) %>%
  select(title, duration_min) %>%
  arrange(desc(duration_min))
## title duration_min
## 1 Black Mirror: Bandersnatch 312
## 2 The School of Mischief 253
## 3 No Longer kids 237
## 4 Sangam 228
## 5 Lagaan 224
## 6 Jodhaa Akbar 214
## 7 Kabhi Khushi Kabhie Gham 209
## 8 The Irishman 209
## 9 No Direction Home: Bob Dylan 208
## 10 The Gospel of Luke 205
## 11 What's Your Raashee? 203
## 12 The Lord of the Rings: The Return of the King 201
# Histogram of movie durations (default binning).
ggplot(movies, aes(x = duration_min)) +
  geom_histogram(fill = 'dark red') +
  labs(title = 'Distribution of Movie Duration')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###
# TV shows only, restricted to rows where country, type, duration, rating and
# title are all present, with the season count parsed from the duration text.
tv_show <- netflix %>%
  filter(type == "TV Show") %>%
  select(country, type, duration, rating, title) %>%
  drop_na() %>%
  mutate(duration_season = parse_number(duration))

# Shows with more than 10 seasons, longest-running first.
tv_show %>%
  filter(duration_season > 10) %>%
  select(title, duration_season) %>%
  arrange(desc(duration_season))
## title duration_season
## 1 Grey's Anatomy 16
## 2 NCIS 15
## 3 Supernatural 15
## 4 COMEDIANS of the world 13
## 5 Red vs. Blue 13
## 6 Criminal Minds 12
## 7 Trailer Park Boys 12
## 8 Cheers 11
## 9 Frasier 11
## 10 Heartland 11
# Histogram of the number of seasons per TV show.
# Season counts are whole numbers, so one bin per season (binwidth = 1) is
# used; the default of 30 bins over this range leaves empty bins between the
# integer values and triggers the stat_bin "pick better value" message.
tv_show %>%
  ggplot(aes(x = duration_season)) +
  geom_histogram(binwidth = 1, fill = 'dark blue') +
  labs(title = 'Distribution of TV Shows Duration')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# One bar per season count, with the number of shows printed above each bar.
tv_show %>%
  count(duration_season, sort = TRUE) %>%
  ggplot(aes(x = as.factor(duration_season), y = n, label = n)) +
  geom_col(aes(fill = duration_season)) +
  geom_text(vjust = -0.5, size = 3, col = "darkblue") +
  theme_light() +
  theme(legend.position = "none") +
  labs(
    x = "Season duration",
    y = "Count",
    title = "Distribution of TV Shows Duration",
    fill = ""
  )
####
# Mean duration over all movies (no rows dropped for missing fields).
netflix %>%
  filter(type == 'Movie') %>%
  summarise(mean_duration = mean(parse_number(duration)))
## mean_duration
## 1 99.30798
# Mean and median duration over all movies.
netflix %>%
  filter(type == 'Movie') %>%
  summarise(
    mean_duration = mean(parse_number(duration)),
    median_duration = median(parse_number(duration))
  )
## mean_duration median_duration
## 1 99.30798 98
# Histogram of durations over all movies (default binning).
netflix %>%
  filter(type == 'Movie') %>%
  mutate(duration = parse_number(duration)) %>%
  ggplot(aes(x = duration)) +
  geom_histogram(fill = 'dark blue') +
  labs(title = 'Distribution of Movie Duration')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Top 20 countries by number of titles, counting only titles whose country
# field names a single country (no comma). Rows with a missing country are
# dropped by the filter as well.
netflix %>%
  filter(!str_detect(country, ',')) %>%
  count(country, sort = TRUE) %>%
  head(20) %>%
  ggplot(aes(y = reorder(country, n), x = n)) +
  geom_col(aes(fill = reorder(country, n)), width = 0.4) +
  geom_label(aes(label = n)) +
  labs(title = 'Number of Shows of each Country')
#netflix$date_added <- as.Date(netflix$date_added, format = "%B %d, %Y")
# Number of shows available in Netflix as a function of time.
# One row per (date_added, type) with the number of titles added that day
# (`addedToday`), the running total per type (`Total_Number_of_Shows`), and a
# `label` that carries the type name only on each type's latest date (used to
# annotate the end of each line in the plot).
# Titles without an added date are removed first: they cannot be placed on
# the time axis and would otherwise produce a row with a missing date.
df_by_date <- netflix %>%
  filter(!is.na(date_added)) %>%
  count(date_added, type, name = "addedToday") %>%
  arrange(date_added) %>% # cumsum below relies on chronological order
  group_by(type) %>%
  mutate(
    Total_Number_of_Shows = cumsum(addedToday),
    label = if_else(
      date_added == max(date_added, na.rm = TRUE),
      as.character(type),
      NA_character_
    )
  )
## `summarise()` has grouped output by 'date_added'. You can override using the `.groups` argument.
# Cumulative number of titles over time, one line per type, with the type name
# written at the end of each line instead of a legend.
# The complete theme (theme_minimal) is applied BEFORE the legend tweak: a
# complete theme replaces every earlier theme setting, so adding it last would
# bring the legend back.
df_by_date %>%
  ggplot(aes(x = date_added, y = Total_Number_of_Shows, color = type)) +
  geom_line() +
  scale_x_date(date_breaks = '2 years', date_labels = "%Y") +
  geom_text_repel(aes(label = label), size = 8, na.rm = TRUE, nudge_y = 100) +
  labs(
    title = "Number of Shows in Netflix by Time",
    x = "years",
    y = "number of shows"
  ) +
  theme_minimal() +
  theme(legend.position = 'none')
## Warning: Removed 1 row(s) containing missing values (geom_path).
# Which countries are producing the most shows?
# Number of titles per country value, separately for each type.
# Missing countries are tested with is.na(); comparing against the string
# 'NA' only dropped them as a side effect of NA propagation in filter().
netflix %>%
  filter(!is.na(country)) %>%
  group_by(type) %>%
  mutate(country = fct_infreq(country)) %>%
  count(country)
## # A tibble: 773 x 3
## # Groups: type [2]
## type country n
## <fct> <fct> <int>
## 1 TV Show United States 705
## 2 TV Show United Kingdom 204
## 3 TV Show Japan 157
## 4 TV Show South Korea 147
## 5 TV Show India 71
## 6 TV Show Taiwan 68
## 7 TV Show Canada 59
## 8 TV Show Australia 46
## 9 TV Show France 46
## 10 TV Show Spain 45
## # … with 763 more rows
# Bar chart of titles per country value, one facet per type, zoomed to the
# first ten positions on the x axis.
# geom_bar() is the counting geom for a discrete x; geom_histogram() with
# stat = 'count' draws the same bars but warns about ignored binning
# parameters.
netflix %>%
  filter(!is.na(country)) %>%
  group_by(type) %>%
  mutate(country = fct_infreq(country)) %>%
  ggplot(aes(x = country, fill = type)) +
  geom_bar() +
  facet_wrap(~type, scales = 'free_x') +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_cartesian(xlim = c(1, 10)) +
  scale_x_discrete(
    labels = function(x) {
      str_wrap(x, 20)
    },
    breaks = function(x) {
      x[1:10]
    }
  )
## Warning: Ignoring unknown parameters: binwidth, bins, pad
##From the above, we can see that: After United States, India is the largest source of Movies listed on Netflix, and UK is the third. While many of the Indian movies ended up on Netflix, it looks like there are not as many Indian TV shows. A large number of listings also seem to have missing country information, which could possibly be inferred from the cast etc.
# Long table with one row per (show, category): the comma-separated listed_in
# field is split into rows and surrounding whitespace is stripped.
show_categories <- netflix %>%
  select(show_id, type, Show_Category = listed_in) %>%
  separate_rows(Show_Category, sep = ',') %>%
  mutate(Show_Category = trimws(Show_Category))
head(show_categories)
## # A tibble: 6 x 3
## show_id type Show_Category
## <chr> <fct> <chr>
## 1 s1 TV Show International TV Shows
## 2 s1 TV Show TV Dramas
## 3 s1 TV Show TV Sci-Fi & Fantasy
## 4 s2 Movie Dramas
## 5 s2 Movie International Movies
## 6 s3 Movie Horror Movies
# Number of shows per category within each type. The factor levels are ordered
# by overall frequency BEFORE grouping, so both types share one level order.
show_categories %>%
mutate(Show_Category = fct_infreq(Show_Category)) %>%
group_by(type)%>%
count(Show_Category)
## # A tibble: 42 x 3
## # Groups: type [2]
## type Show_Category n
## <fct> <fct> <int>
## 1 TV Show International TV Shows 1199
## 2 TV Show TV Dramas 704
## 3 TV Show TV Comedies 525
## 4 TV Show Crime TV Shows 427
## 5 TV Show Kids' TV 414
## 6 TV Show Docuseries 353
## 7 TV Show Romantic TV Shows 333
## 8 TV Show British TV Shows 232
## 9 TV Show Reality TV 222
## 10 TV Show Korean TV Shows 150
## # … with 32 more rows
# Bar chart of shows per category, one facet per type, zoomed to the first
# twenty positions on the x axis.
show_categories %>%
  mutate(Show_Category = fct_infreq(Show_Category)) %>%
  ggplot(aes(x = Show_Category, fill = type)) +
  geom_bar() +
  facet_wrap(~type, scales = 'free_x') +
  coord_cartesian(xlim = c(1, 20)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
##############
# Category co-occurrence: for every pair of categories of the same type, how
# many shows are listed under both.

# Distinct (type, category) combinations, sorted by type then category.
df_unique_categories <- show_categories %>%
  group_by(type, Show_Category) %>%
  summarise(.groups = "drop")

# All ordered (Category1, Category2) pairs among the categories of one type.
category_pairs <- function(show_type) {
  categories <- subset(df_unique_categories, type == show_type)$Show_Category
  data.frame(expand_grid(
    type = show_type,
    Category1 = categories,
    Category2 = categories
  ))
}

# Categories x categories matrix whose [a, b] entry is the number of distinct
# shows listed under both a and b. Built from a 0/1 shows x categories
# incidence matrix, so one matrix product replaces a pair of subset() calls
# per category pair.
shared_show_counts <- function(categories_of_type) {
  incidence <- unclass(
    table(categories_of_type$show_id, categories_of_type$Show_Category)
  ) > 0
  crossprod(incidence * 1L)
}

df_category_correlations_movies <- category_pairs('Movie')
df_category_correlations_TV <- category_pairs('TV Show')
df_category_correlations <- rbind(
  df_category_correlations_movies,
  df_category_correlations_TV
)

# Look up the shared-show count of every pair, one type at a time.
df_category_correlations$matched_count <- 0L
for (show_type in unique(df_category_correlations$type)) {
  of_type <- df_category_correlations$type == show_type
  shared <- shared_show_counts(
    show_categories[show_categories$type == show_type, ]
  )
  # match() rather than name indexing, so an unexpected label yields NA
  # instead of a subscript error.
  pair_index <- cbind(
    match(df_category_correlations$Category1[of_type], rownames(shared)),
    match(df_category_correlations$Category2[of_type], colnames(shared))
  )
  matched <- as.integer(shared[pair_index])
  matched[is.na(matched)] <- 0L
  df_category_correlations$matched_count[of_type] <- matched
}

# Keep each unordered pair once (Category1 sorts before Category2) and only
# pairs that actually share at least one show.
df_category_correlations <- subset(
  df_category_correlations,
  (as.character(Category1) < as.character(Category2)) & (matched_count > 0)
)
# Set the plot size options to 14 x 10.
options(repr.plot.width = 14, repr.plot.height = 10)
# Heat map of how many movies share each pair of categories.
df_category_correlations %>%
  subset(type == 'Movie') %>%
  ggplot(aes(x = Category1, y = Category2, fill = matched_count)) +
  geom_tile() +
  facet_wrap(~type, scales = 'free') +
  scale_fill_distiller(palette = "Spectral") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 16)
  )
# The 30 most frequent genres across all titles.
# listed_in is a comma-separated list; every piece is trimmed after splitting
# so that e.g. " Dramas" (second in a list) and "Dramas" (first in a list) are
# counted as the same genre.
netflix %>%
  select(listed_in) %>%
  mutate(listed_in = str_split(as.character(listed_in), ',')) %>%
  unnest(listed_in) %>%
  mutate(listed_in = str_trim(listed_in)) %>%
  count(listed_in, sort = TRUE) %>%
  head(30)
## # A tibble: 30 x 2
## # Groups: listed_in [30]
## listed_in n
## <chr> <int>
## 1 " International Movies" 2323
## 2 "Dramas" 1384
## 3 "Comedies" 1074
## 4 "Documentaries" 751
## 5 " Dramas" 722
## 6 "Action & Adventure" 721
## 7 "International TV Shows" 690
## 8 " Independent Movies" 653
## 9 " TV Dramas" 642
## 10 " Romantic Movies" 528
## # … with 20 more rows
# Bar chart of the 30 most frequent genres across all titles.
# Each piece of the comma-separated listed_in field is trimmed after
# splitting; otherwise a genre with a leading space (" Dramas") is treated as
# a different genre from the same name without one ("Dramas").
netflix %>%
  select(listed_in) %>%
  mutate(listed_in = str_split(as.character(listed_in), ',')) %>%
  unnest(listed_in) %>%
  mutate(listed_in = str_trim(listed_in)) %>%
  count(listed_in, sort = TRUE) %>%
  head(30) %>%
  ggplot(aes(y = reorder(listed_in, n), x = n)) +
  geom_col(aes(fill = n)) +
  labs(
    title = 'Category of Movie and TV Shows',
    x = 'Count',
    y = 'Genre'
  ) +
  geom_label(aes(label = n), size = 2) +
  theme_minimal()
# Long table with one row per (show, role, person), where role is "cast" or
# "director" and the comma-separated name lists are split into rows.
# pivot_longer() replaces the superseded gather(); arrange(role) keeps the
# previous row order (all cast rows first, then all director rows).
show_people <- netflix %>%
  select(show_id, cast, director) %>%
  pivot_longer(
    c(cast, director),
    names_to = "role",
    values_to = "person"
  ) %>%
  arrange(role) %>%
  filter(!is.na(person), person != "") %>%
  separate_rows(person, sep = ',') %>%
  mutate(person = trimws(person))
head(show_people)
## # A tibble: 6 x 3
## show_id role person
## <chr> <chr> <chr>
## 1 s1 cast João Miguel
## 2 s1 cast Bianca Comparato
## 3 s1 cast Michel Gomes
## 4 s1 cast Rodolfo Valente
## 5 s1 cast Vaneza Oliveira
## 6 s1 cast Rafael Lozano
# Number of titles per (person, role), most frequent first. count() returns an
# ungrouped table, so no leftover grouping by person leaks into later steps
# and no grouping message is printed.
people_freq <- show_people %>%
  count(person, role, name = "count", sort = TRUE)
## `summarise()` has grouped output by 'person'. You can override using the `.groups` argument.
# The ten most frequent people per role (ties at the cut-off are kept), shown
# as bars in decreasing order with one facet per role.
people_freq %>%
  group_by(role) %>%
  slice_max(count, n = 10) %>%
  ungroup() %>%
  ggplot(aes(
    x = fct_reorder(person, count, .desc = TRUE),
    y = count,
    fill = role
  )) +
  geom_col() +
  facet_wrap(~role, scales = 'free_x') +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.position = 'none'
  ) +
  labs(x = 'Name of the actor / director')
# Number of titles per release year; the bar outline colour encodes the count.
netflix %>%
  count(release_year) %>%
  arrange(desc(n)) %>%
  ggplot(aes(x = release_year, y = n, color = n)) +
  geom_col()
# Where do the majority of titles available on Netflix come from?
sum(is.na(netflix$country))
## [1] 507
# Split the comma-separated country field into one row per (title, country),
# trim the names, and count titles per country AND type.
# `n` is the per-type count, which is what a stacked bar needs; `total` is the
# country's count over both types and is used only for ranking and ordering.
# (Tallying per country alone and then de-duplicating gives both type rows the
# same country total, which double-counts every stacked bar and splits it
# 50/50.)
df_country <- netflix %>%
  filter(!is.na(country)) %>%
  separate_rows(country, sep = ",") %>%
  mutate(country = trimws(country)) %>%
  filter(country != "") %>% # guards against stray leading/trailing commas
  count(country, type, name = "n") %>%
  group_by(country) %>%
  mutate(total = sum(n)) %>%
  ungroup() %>%
  select(country, n, type, total)

# Keep the countries with the most titles overall. Selection is by country,
# not by row, so a country's Movie and TV Show rows are never separated.
n_top_countries <- 18
top_countries <- df_country %>%
  distinct(country, total) %>%
  slice_max(total, n = n_top_countries, with_ties = FALSE) %>%
  pull(country)
df_country_top5 <- df_country %>%
  filter(country %in% top_countries) %>%
  arrange(desc(total), country, type)
df_country_top5

# Stacked horizontal bars: bar length is the country's total, split by type.
ggplot(df_country_top5, aes(x = reorder(country, total), y = n, fill = type)) +
  geom_col() +
  coord_flip() +
  theme_classic() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  ) +
  labs(title = "Content available per country")
##What are the most frequent words used in movie titles?
library(tokenizers)
library(wordcloud)
## Loading required package: RColorBrewer
#
# tot_title <- paste(netflix[,3],collapse=" ")
# tot_title_words <- tokenize_words(tot_title)
# words.freq<-table(unlist(tot_title_words))
#
# result <- cbind.data.frame(words = names(words.freq),amount = as.integer(words.freq)) ## You might consider using cbind.data.frame instead of cbind
#
# result_dec <- result[order(-result$amount),]
#
# result_dec_filter <- result_dec %>%
# filter(nchar( as.character(words)) > 3)
#
# wordcloud(words = result_dec_filter$word, freq = result_dec_filter$amount, min.freq = 1, max.words=150, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Set2"))
library(tokenizers)
library(wordcloud)
library(tidytext)
# Word cloud of the words used in movie descriptions.
# Descriptions are split into one word per row and stop words are removed.
# The join key is named explicitly so the result does not depend on which
# columns the two tables happen to share.
desc_words_m <- netflix %>%
  select(type, show_id, description) %>%
  filter(type == "Movie") %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word")
count_word <- desc_words_m %>% count(word, sort = TRUE)
# Only words occurring at least 55 times are drawn.
wordcloud(
  words = count_word$word,
  freq = count_word$n,
  min.freq = 55,
  max.words = nrow(count_word),
  random.order = FALSE,
  rot.per = 0.1,
  colors = brewer.pal(8, "Dark2")
)
#Life, Women, Love, Friends, Family, Home, world
# Word cloud of the words used in TV show descriptions.
# Descriptions are split into one word per row and stop words are removed.
# The join key is named explicitly so the result does not depend on which
# columns the two tables happen to share.
desc_words_tv <- netflix %>%
  select(type, show_id, description) %>%
  filter(type == "TV Show") %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word")
count_word <- desc_words_tv %>%
  count(word, sort = TRUE)
# Only words occurring at least 30 times are drawn.
wordcloud(
  words = count_word$word,
  freq = count_word$n,
  min.freq = 30,
  max.words = nrow(count_word),
  random.order = FALSE,
  rot.per = 0.1,
  colors = brewer.pal(8, "Dark2")
)
#World, Life, Love, Lives, Friends, Family, School are the most frequent words
# titles=str_flatten(netflix[,3],collapse = '')
# titles_split=tokenize_words(titles)
# word_freq=table(unlist(titles_split))
#
# result=cbind.data.frame(words=names(word_freq),
# word_count=as.integer(word_freq))
# result_top=result%>%arrange(desc(word_count))%>%
# filter(nchar(as.character(words))>3)
#
# cloud=wordcloud(words = result_top$words,freq = result_top$word_count,min.freq = 1,
# max.words = 200,random.order = F,
# rot.per =0.35, colors=brewer.pal(8,"Dark2"))
##Rating by Type
# Titles per (rating, type), most frequent first. Only the five most common
# ratings keep their own level; the rest are lumped into "Other".
r <- netflix %>%
  filter(!is.na(rating)) %>%
  select(rating, type) %>%
  mutate(rating = fct_lump(rating, 5)) %>%
  group_by(rating, type) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count))
## `summarise()` has grouped output by 'rating'. You can override using the `.groups` argument.
# Rating counts for movies only.
r %>% filter(type=='Movie')
## # A tibble: 6 x 3
## # Groups: rating [6]
## rating type Count
## <fct> <fct> <int>
## 1 TV-MA Movie 1845
## 2 TV-14 Movie 1272
## 3 Other Movie 701
## 4 R Movie 663
## 5 TV-PG Movie 505
## 6 PG-13 Movie 386
# Rating counts for TV shows only.
r %>% filter(type=='TV Show')
## # A tibble: 5 x 3
## # Groups: rating [5]
## rating type Count
## <fct> <fct> <int>
## 1 TV-MA TV Show 1018
## 2 TV-14 TV Show 659
## 3 Other TV Show 428
## 4 TV-PG TV Show 301
## 5 R TV Show 2
# Stacked bars: one bar per type, segments coloured by rating.
ggplot(r, aes(x = type, y = Count, fill = rating)) +
  geom_col()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive bar chart of titles per type, one bar per rating, with the count
# printed outside each bar.
# The axis titles match the mappings (x = type, y = Count). The previous
# `categoryorder` setting is dropped because it applies to categorical axes
# and y is numeric here.
netflix %>%
  select(rating, type) %>%
  filter(!is.na(rating)) %>%
  mutate(rating = fct_lump(rating, 5)) %>%
  group_by(rating, type) %>%
  summarise(Count = n()) %>%
  arrange(Count) %>%
  plot_ly(
    x = ~type,
    y = ~Count,
    type = "bar",
    color = ~rating,
    text = ~Count,
    textposition = 'outside',
    textfont = list(color = '#000000', size = 12)
  ) %>%
  layout(
    title = "Rating by Type",
    xaxis = list(title = "Type"),
    yaxis = list(title = "Count"),
    legend = list(title = list(text = '<b> Rating </b>'))
  )
## `summarise()` has grouped output by 'rating'. You can override using the `.groups` argument.
# Movies and TV shows per country for the ten countries with the most titles.

# One entry per (title, country): split the comma-separated country field and
# repeat each title's type once per country it lists. Names are trimmed so a
# stray space or trailing comma cannot create a separate "country".
k <- strsplit(netflix$country, split = ",")
netds_countries <- data.frame(
  type = rep(netflix$type, lengths(k)),
  country = trimws(unlist(k)),
  stringsAsFactors = FALSE
)
netds_countries$country <- as.character(netds_countries$country)

# Titles per (country, type); rows with a missing or empty country are dropped.
amount_by_country <- na.omit(netds_countries) %>%
  filter(country != "") %>%
  count(country, type, name = "count")

# One row per country with a column per type. Columns are selected BY NAME, so
# the labels cannot be swapped if the types appear in a different order, and a
# country lacking one type gets 0 instead of NA. The top ten are chosen by the
# combined total and returned in decreasing order of that total (without an
# explicit weight, top_n() would rank by whichever column happens to be last).
u <- amount_by_country %>%
  pivot_wider(names_from = type, values_from = count, values_fill = 0L) %>%
  select(country, Number_of_Movies = Movie, Number_of_TV_Shows = `TV Show`) %>%
  slice_max(Number_of_Movies + Number_of_TV_Shows, n = 10, with_ties = FALSE) %>%
  as.data.frame()
library(ggplot2)
# Scatter plot of movies versus TV shows, one coloured point per country in `u`.
figure000 <- ggplot(
  u,
  aes(x = Number_of_Movies, y = Number_of_TV_Shows, colour = country)
) +
  geom_point(size = 5) +
  labs(
    x = "Number of Movies",
    y = "Number of TV Shows",
    title = "Amount of Netflix Content By Top 10 Country"
  )
figure000